In [1]:
import numpy as np
import pandas as pd
In [2]:
# Load the sequencing core's demultiplexing sheet (tab-separated).
info = pd.read_csv('../data/Demultiplex_Sheet.txt', delimiter='\t')
In [3]:
# Order rows by sample identifier so downstream output is deterministic.
info = info.sort_values(by='#SampleID')
info.head()
Out[3]:
In [4]:
# Keep a single row per (sample ID, reverse primer) pair, then persist the
# deduplicated sheet for the record.
dedup_keys = ['#SampleID', 'ReversePrimer']
info = info.drop_duplicates(subset=dedup_keys)
info.to_csv('unduplicated_demultiplex.txt', sep='\t', index=False)
In [5]:
# Quick visual check of the deduplicated sheet.
info.head()
Out[5]:
In [6]:
# Number of samples remaining after deduplication.
info.shape[0]
Out[6]:
In [7]:
# Build the prep-information table: one row per sample, with per-sample fields
# pulled from the demultiplex sheet and constant fields repeated n times.
n = info.shape[0]
ids = info.loc[:, '#SampleID']
data = {
    # .values drops the pandas index, keeping only the raw values.
    'Sample_Name': ids.str[7:13].values,   # sample code embedded in the ID
    'Barcode': info.loc[:, 'BarcodeSequence'].values,
    'LinkerPrimerSequence': info.loc[:, 'LinkerPrimerSequence'].values,
    'Description': info.loc[:, 'Description'].values,
    'Experiment_Design_Description': np.repeat(
        '16S stool samples sequenced for MrOS Vitamin D study', n),
    'Library_Construction_Protocol': np.repeat('16S rRNA v4', n),
    'Linker': np.repeat('GT', n),
    'Platform': np.repeat('Illumina', n),
    'Center_Name': ids.str[14:16].values,  # site code embedded in the ID
    'Center_Project': np.repeat('MrOS', n),
    'Instrument_Model': np.repeat('Illumina MiSeq', n),
}
# Dict insertion order already matches the desired column order.
prep_info = pd.DataFrame(data, columns=list(data))
In [8]:
# Sanity check: dimensions of the prep table.
prep_info.shape
Out[8]:
In [9]:
# Inspect the first ten prep rows.
prep_info.head(10)
Out[9]:
In [10]:
# Fix mismatched sample name: 'BIO778' (letter O) should be 'BI0778' (zero),
# per Nora's email on 08/07/2017. replace() with no column scope touches every
# cell, but this ID string only occurs in Sample_Name.
prep_info = prep_info.replace(to_replace='BIO778', value='BI0778')
In [2]:
# Load the two halves of the subject metadata (comma is pandas' default sep).
samples_part1 = pd.read_csv('../data/VitDMetadata_update.csv')
samples_part2 = pd.read_csv('../data/Other32metadata.csv')
In [3]:
# Row/column counts of the two metadata parts before merging.
print(samples_part1.shape)
print(samples_part2.shape)
In [4]:
# Confirm both parts share the same column layout before concatenating.
print(samples_part1.columns)
print(samples_part2.columns)
In [5]:
# Merge two metadata files: stack the parts row-wise with a fresh 0..m-1 index.
samples = pd.concat([samples_part1, samples_part2], ignore_index=True)
In [6]:
# Combined metadata dimensions.
samples.shape
Out[6]:
In [7]:
#samples.M1ANTIB.value_counts()
Out[7]:
In [16]:
# Count distinct subject IDs (should equal the row count if IDs are unique).
len(samples['ID'].unique())
Out[16]:
In [17]:
# Inspect the first ten metadata rows.
samples.head(10)
Out[17]:
In [18]:
# Study site for each subject (NOTE: this is re-assigned in the next cell).
sites = samples.loc[:, 'SITE']
In [19]:
# reference: http://www.latlong.net/
# Map each study site to its (latitude, longitude). Coordinates are kept as
# strings because they are written verbatim into the text metadata file.
SITE_COORDS = {
    'Birmingham':  ('33.520661', '-86.80249'),
    'San Diego':   ('32.715738', '-117.1611'),
    'Pittsburgh':  ('40.440625', '-79.99589'),
    'Palo Alto':   ('37.441883', '-122.143'),
    'Portland':    ('45.523062', '-122.6765'),
    'Minneapolis': ('44.977753', '-93.26501'),
}

Latitude = []
Longitude = []
sites = samples.loc[:, 'SITE']
for site in sites:
    try:
        latitude, longitude = SITE_COORDS[site]
    except KeyError:
        # The original if/elif chain silently reused the previous row's
        # coordinates for an unrecognized site (and crashed with NameError if
        # the first row was unknown). Fail loudly so bad data is caught early.
        raise ValueError('Unknown study site: {!r}'.format(site))
    Latitude.append(latitude)
    Longitude.append(longitude)
In [20]:
# Spot-check a handful of rows: site names should line up with the coordinates.
check_rows = [1, 90, 200, 300, 400, 500]
print(samples['SITE'][check_rows])
print(np.array(Latitude)[check_rows])
print(np.array(Longitude)[check_rows])
In [21]:
# Required sample-information fields for submission: per-sample values come
# from the metadata frame; study-wide constants are repeated for all m rows.
m = samples.shape[0]

def _fill(value):
    # Column holding the same value for every one of the m samples.
    return np.repeat(value, m)

required = {
    'Sample_Name': samples.loc[:, 'ID'].values,
    'Title': _fill('MrOS_VitaminD'),
    'Anonymized_Name': samples.loc[:, 'ID'].values,
    'Scientific_Name': _fill('human gut metagenome'),
    'Taxon_ID': _fill('Not applicable'),
    #'Description': _fill('Not applicable'),
    'Sample_Type': _fill('stool'),
    'Geo_Loc_Name': samples.loc[:, 'SITE'].values,
    'Elevation': _fill('Not applicable'),
    'Env_Biome': _fill('urban biome'),
    'Env_Feature': _fill('human-associated habitat'),
    'Env_Material': _fill('feces'),
    'Env_Package': _fill('human-gut'),
    'Latitude': Latitude,
    'Longitude': Longitude,
    'Collection_Timestamp': _fill('Not applicable'),
    'DNA_Extracted': _fill('Not applicable'),
    'Physical_Specimen_Location': _fill('Not applicable'),
    'Physical_Specimen_Remaining': _fill('Not applicable'),
    'Age': samples.loc[:, 'V4AGE1'].values,
    'Age_Units': _fill('years'),
    'Host_Subject_ID': _fill('Not applicable'),
    'Host_Taxid': _fill('Not applicable'),
    'Host_Scientific_Name': _fill('Homo sapiens'),
    'Host_Common_Name': _fill('human'),
    'Life_Stage': _fill('adult'),
    'Sex': _fill('male'),  # MrOS is an all-male cohort
    'Height': samples.loc[:, 'HWHGT'].values,
    'Height_Units': _fill('cm'),
    'Weight': samples.loc[:, 'HWWGT'].values,
    'Weight_Units': _fill('kg'),
    'BMI': samples.loc[:, 'HWBMI'].values,
    'Body_Habitat': _fill('UBERON:feces'),
    'Body_Site': _fill('UBERON:feces'),
    'Body_Product': _fill('UBERON:feces'),
}
In [22]:
# Attach the required fields to the raw metadata, then restrict/order the
# columns: required submission fields first, selected study variables last.
# ('Description' stays commented out, mirroring the `required` dict above.)
ordered_cols = [
    'Sample_Name', 'Title', 'Anonymized_Name', 'Scientific_Name', 'Taxon_ID',
    #'Description',
    'Sample_Type', 'Geo_Loc_Name', 'Elevation',
    'Env_Biome', 'Env_Feature', 'Env_Material', 'Env_Package',
    'Latitude', 'Longitude',
    'Collection_Timestamp', 'DNA_Extracted',
    'Physical_Specimen_Location', 'Physical_Specimen_Remaining',
    'Age', 'Age_Units', 'Host_Subject_ID', 'Host_Taxid',
    'Host_Scientific_Name', 'Host_Common_Name',
    'Life_Stage', 'Sex', 'Height', 'Height_Units', 'Weight', 'Weight_Units',
    'BMI', 'Body_Habitat', 'Body_Site', 'Body_Product',
    'GIERACE', 'SITE', 'TUDRAMT', 'PASCORE', 'TURSMOKE', 'DTVITD',
    'M1ADEPR', 'M1VITMND', 'M1ANTIB', 'M1PROBI', 'OHV1D3', 'OHV24D3',
    'OHVD3', 'OHVD2', 'OHV1D2', 'OHV1D2CT', 'OHVD2CT', 'OHVDTOT',
    'OHV1DTOT', 'OHSEAS', 'VDstatus',
]
sample_info = pd.concat([pd.DataFrame(required), samples], axis=1)
# reindex(columns=...) selects/orders columns exactly like
# pd.DataFrame(df, columns=...) did (NaN-fills any missing column).
sample_info = sample_info.reindex(columns=ordered_cols)
In [23]:
# Final sample-information dimensions.
sample_info.shape
Out[23]:
In [24]:
# Inspect the first ten sample-information rows.
sample_info.head(10)
Out[24]:
In [25]:
# Sample IDs present in the prep table vs. in the sample-metadata table.
id_prep = prep_info['Sample_Name'].tolist()
id_sample = sample_info['Sample_Name'].tolist()
In [26]:
# Samples with metadata yet no sequencing data.
# Use a set for O(1) membership tests instead of rescanning the id_prep list
# for every sample (the original loop was O(n*m)); print order is unchanged.
prep_ids = set(id_prep)
for sample_id in id_sample:
    if sample_id not in prep_ids:
        print(sample_id)
In [27]:
# Samples with sequencing data yet no metadata (order and duplicates kept).
no_sample_info = [sample_id for sample_id in id_prep
                  if sample_id not in id_sample]
In [28]:
# How many (and which) sequenced samples lack metadata.
print(len(no_sample_info))
print(no_sample_info)
In [29]:
# Exclude PO7100, which has no microbiome metadata (Lily Liu's email, 08/07/2017).
keep = ~prep_info['Sample_Name'].isin(no_sample_info)
prep_info = prep_info[keep]
In [30]:
# Prep-table dimensions after dropping unmatched samples.
prep_info.shape
Out[30]:
In [31]:
# Write the sample and prep tables; flag missing metadata values explicitly
# with the Qiita-style 'Missing:not collected' marker.
sample_info.to_csv('../data/sample_MrOS.txt', sep='\t',
                   na_rep='Missing:not collected', index=False)
prep_info.to_csv('../data/prep_MrOS.txt', sep='\t', index=False)
In [32]:
# Final look at the prep table before merging.
prep_info.head(5)
Out[32]:
In [33]:
# Inner join (pandas' default `how`) on Sample_Name: keeps only IDs present
# in both the prep and sample tables.
mapping = pd.merge(prep_info, sample_info, on='Sample_Name')
In [34]:
# Row counts before/after the merge (mapping should match prep_info's rows).
print(prep_info.shape)
print(sample_info.shape)
print(mapping.shape)
In [35]:
# Rename to QIIME mapping-file conventions. One rename call instead of two,
# and no inplace=True (inplace offers no benefit and hides state changes).
mapping = mapping.rename(columns={'Sample_Name': '#SampleID',
                                  'Barcode': 'BarcodeSequence'})
In [36]:
# Mapping-file dimensions.
mapping.shape
Out[36]:
In [37]:
# Inspect the renamed mapping table.
mapping.head(5)
Out[37]:
In [38]:
# QIIME expects 'Description' to be the last column; move it to the end.
reordered = [col for col in mapping.columns if col != 'Description']
reordered.append('Description')
mapping = mapping.loc[:, reordered]
In [39]:
# Verify the column order before writing the file.
mapping.head(5)
Out[39]:
In [40]:
# Write the final QIIME mapping file (tab-separated, no index column).
mapping.to_csv('../data/mapping_MrOS.txt', sep='\t', index=False)
In [ ]: